\documentclass[11pt]{ctexart}  
\usepackage{ctex}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{hyperref}
% \usepackage[hidelinks]{hyperref} 去除超链接的红色框
\usepackage{setspace}
\usepackage{titlesec}
\usepackage{float} % 调用该包能够使用[H]
% \pagestyle{plain} % 去除页眉，但是保留页脚编号，都去掉plain换empty

% Render footnote markers as circled numbers (dingbats from pifont).
\usepackage{pifont}
\makeatletter
% \circnum{<counter>}: typeset the current value of the LaTeX counter
% <counter> as a pifont dingbat. The counter's internal register
% \c@<counter> is built with \csname and handed to \@circnum.
\newcommand*{\circnum}[1]{%
  \expandafter\@circnum\csname c@#1\endcsname
}
% \@circnum{<count register>}: accepts values 1..20 and calls \@ctrerr
% for anything outside that range.
\newcommand*{\@circnum}[1]{%
  \ifnum#1<1 %
    \@ctrerr
  \else
    \ifnum#1>20 %
      \@ctrerr
    \else
      % Dingbat slot = 171 + value, i.e. slots 172..191.
      % NOTE(review): slots 172--181 should be the outlined circled 1--10;
      % 182--191 are presumably the filled circled 1--10, so footnotes 11--20
      % would reuse the digits 1--10 in a different style -- confirm against
      % the pifont character table.
      \ding{\the\numexpr 171+(#1)\relax}%
    \fi
  \fi
}
\makeatother

% Use the circled style for every footnote mark in the document.
\renewcommand*{\thefootnote}{\circnum{footnote}}

\begin{document}
\tableofcontents % 目录，注意要运行两下或者vscode保存两下才能显示
% \singlespacing
\clearpage
\section{模版备用}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{算法}\footnotemark[1]} 
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 测试
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{脚注}
\clearpage
\section{策略迭代算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{策略迭代算法}} 
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 初始化状态价值函数$V(s)$和策略$\pi(s)$
		\STATE {\bfseries 策略估计：}
		\REPEAT
			\STATE $\Delta \leftarrow 0$
			\REPEAT
				\STATE $v \leftarrow V(s)$
				\STATE $V(s) \leftarrow \sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, \pi(s)\right)\left[r+\gamma V\left(s^{\prime}\right)\right]$
				\STATE $\Delta \leftarrow \max (\Delta,|v-V(s)|)$
			\UNTIL{遍历所有的状态$s \in S$}
		\UNTIL{$\Delta < \theta$}
		\STATE {\bfseries 策略改进：}
		\STATE $stable\_flag \leftarrow true$
		\REPEAT
			\STATE 根据策略$\pi(a|s)$生成动作$a_{temp}$
			\STATE 更新策略：$\pi(a|s) \leftarrow \arg \max _a \sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right)\left[r+\gamma V\left(s^{\prime}\right)\right]$
			\IF{$a_{temp} \neq \pi(a|s)$}
				\STATE 说明策略还未收敛，$stable\_flag \leftarrow false$
			\ENDIF
		\UNTIL{遍历所有的状态$s \in S$}
		\IF{$stable\_flag = true$}
			\STATE 结束迭代并返回最优策略$\pi \approx \pi_*$和状态价值函数$V \approx V_*$
		\ELSE
			\STATE 继续执行策略估计
		\ENDIF

	\end{algorithmic}
\end{algorithm}
\clearpage
\section{价值迭代算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{价值迭代算法}} 
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 初始化一个很小的参数阈值$\theta>0$，以及状态价值函数$V(s)$，注意终止状态的$V(s_T)=0$
		\REPEAT
			\STATE $\Delta \leftarrow 0$
			\REPEAT
				\STATE $v \leftarrow V(s)$
				\STATE $V(s) \leftarrow \max _a \sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right)\left[r+\gamma V\left(s^{\prime}\right)\right]$
				\STATE $\Delta \leftarrow \max (\Delta,|v-V(s)|)$
			\UNTIL{遍历所有的状态$s \in S$}
		\UNTIL{$\Delta < \theta$}
		\STATE 输出一个确定性策略$\pi \approx \pi_*$，\\ 且$\pi(s)=\arg \max _a \sum_{s^{\prime}, r} p\left(s^{\prime}, r \mid s, a\right)\left[r+\gamma V\left(s^{\prime}\right)\right]$
	\end{algorithmic}
\end{algorithm}
\clearpage
\section{首次访问蒙特卡洛算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{首次访问蒙特卡洛算法}} 
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 初始化价值函数 $V(s)$，一个空的回报列表 $Returns(s_t)$
		\FOR {回合数 = $1,M$}
			\STATE 根据策略$\pi$采样一回合轨迹$\tau=\{s_0,a_0,r_1,\cdots,s_{T-1},a_{T-1},r_{T}\}$
			\STATE 初始化回报 $G \leftarrow 0$
			\FOR {时步 $t = T-1,T-2,\cdots,0$}
				\STATE $G \leftarrow \gamma G + R_{t+1}$
				\IF{$s_t$ 未在更早的时步 $s_0,\cdots,s_{t-1}$ 中出现，即本回合首次访问}
					\STATE  将 $G$ 添加到 $Returns(s_t)$
					\STATE $V\left(S_t\right) \leftarrow \operatorname{average}\left(\operatorname{Returns}\left(S_t\right)\right)$
				\ENDIF
			\ENDFOR
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{脚注}
\clearpage
\section{Q learning算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{Q-learning算法}\footnotemark[1]}
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 初始化Q表$Q(s,a)$为任意值，但其中$Q(s_{terminal},\cdot)=0$，即终止状态对应的Q值为0
		\FOR {回合数 = $1,M$}
			\STATE 重置环境，获得初始状态$s_1$
			\FOR {时步 = $1,T$}
				\STATE 根据$\varepsilon-greedy$策略采样动作$a_t$
				\STATE 环境根据$a_t$反馈奖励$r_t$和下一个状态$s_{t+1}$
				\STATE {\bfseries 更新策略：}
				\STATE $Q(s_t,a_t) \leftarrow Q(s_t,a_t)+\alpha[r_t+\gamma\max _{a}Q(s_{t+1},a)-Q(s_t,a_t)]$
				\STATE 更新状态$s_t \leftarrow s_{t+1}$
			\ENDFOR
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Reinforcement Learning: An Introduction}
\clearpage
\section{Sarsa算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{Sarsa算法}\footnotemark[1]}
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 初始化Q表$Q(s,a)$为任意值，但其中$Q(s_{terminal},\cdot)=0$，即终止状态对应的Q值为0
		\FOR {回合数 = $1,M$}
			\STATE 重置环境，获得初始状态$s_1$
			\STATE 根据$\varepsilon-greedy$策略采样初始动作$a_1$
			\FOR {时步 = $1,T$}
				\STATE 环境根据$a_t$反馈奖励$r_t$和下一个状态$s_{t+1}$
				\STATE 根据$\varepsilon-greedy$策略和状态$s_{t+1}$采样动作$a_{t+1}$
				\STATE {\bfseries 更新策略：}
				\STATE $Q(s_t,a_t) \leftarrow Q(s_t,a_t)+\alpha[r_t+\gamma Q(s_{t+1},a_{t+1})-Q(s_t,a_t)]$
				\STATE 更新状态$s_t \leftarrow s_{t+1}$
				\STATE 更新动作$a_t \leftarrow a_{t+1}$
			\ENDFOR
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Reinforcement Learning: An Introduction}
\clearpage

\section{DQN算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{DQN算法}\footnotemark[1]}  
    \renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
    \renewcommand{\algorithmicrequire}{\textbf{输入:}}  
    \renewcommand{\algorithmicensure}{\textbf{输出:}} 
	\begin{algorithmic}[1]
		% \REQUIRE $n \geq 0 \vee x \neq 0$ % 输入
		% \ENSURE $y = x^n$ % 输出
		\STATE 初始化策略网络参数$\theta$ % 初始化
		\STATE 复制参数到目标网络$\hat{Q} \leftarrow Q$
		\STATE 初始化经验回放$D$
		\FOR {回合数 = $1,M$}
			\STATE 重置环境，获得初始状态$s_t$
			\FOR {时步 = $1,t$}
				\STATE 根据$\varepsilon-greedy$策略采样动作$a_t$
				\STATE 环境根据$a_t$反馈奖励$r_t$和下一个状态$s_{t+1}$
				\STATE 存储transition即$(s_t,a_t,r_t,s_{t+1})$到经验回放$D$中
				\STATE 更新环境状态$s_t \leftarrow s_{t+1}$
				\STATE {\bfseries 更新策略：}
				\STATE 从$D$中采样一个batch的transition
				\STATE 计算实际的$Q$值，即$y_{i}$\footnotemark[2]
				\STATE 对损失 $L(\theta)=\left(y_{i}-Q\left(s_{i}, a_{i} ; \theta\right)\right)^{2}$关于参数$\theta$做随机梯度下降\footnotemark[3]
			\ENDFOR
			\STATE 每$C$个回合复制参数$\hat{Q}\leftarrow Q$\footnotemark[4]
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Playing Atari with Deep Reinforcement Learning}
\footnotetext[2]{$y_{i}= \begin{cases}r_{i} & \text {对于终止状态} s_{i+1} \\ r_{i}+\gamma \max _{a^{\prime}} Q\left(s_{i+1}, a^{\prime} ; \theta\right) & \text {对于非终止状态} s_{i+1}\end{cases}$}
\footnotetext[3]{$\theta_i \leftarrow \theta_i - \lambda \nabla_{\theta_{i}} L_{i}\left(\theta_{i}\right)$}
\footnotetext[4]{此处也可像原论文中放到小循环中改成每$C$步，但没有每$C$个回合稳定}
\clearpage

\section{DRQN算法}
\begin{algorithm}[H] % [H]固定位置
	\floatname{algorithm}{{DRQN算法}\footnotemark[1]}  
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\renewcommand{\algorithmicrequire}{\textbf{输入:}}  
	\renewcommand{\algorithmicensure}{\textbf{输出:}} 
	\begin{algorithmic}[1]
		% \REQUIRE $n \geq 0 \vee x \neq 0$ % 输入
		% \ENSURE $y = x^n$ % 输出
		\STATE 初始化策略网络参数$\theta$ % 初始化
		\STATE 复制参数到目标网络$\hat{Q} \leftarrow Q$
		\STATE 初始化经验回放$D$
		\FOR {回合数 = $1,M$}
		\STATE 重置环境,获得初始状态的观测$o_t$
		\STATE $h_{0} \leftarrow 0$
		\FOR {时步 = $1,t$}
		\STATE 根据$\varepsilon-greedy$策略采样动作$a_t$
		\STATE 环境根据$a_t$反馈奖励$r_t$和下一个状态，生成下一状态的观测$o_{t+1}$
		\STATE 存储transition即$(o_t,a_t,r_t,o_{t+1})$到经验回放$D$中
		\STATE 更新环境状态对应的观测$o_t \leftarrow o_{t+1}$
		\STATE {\bfseries 更新策略：}
		% \IF{回合数 \% freeze internal == 0} 
		% \STATE 更新参数$\hat{Q} \leftarrow Q$
		% \ENDIF
		\STATE 从$D$中采样一个batch的transition, 即 
		\\ $B = \left\{(s_{j}, a_{j}, r_{j}, s_{j}^{'}) \dots (s_{j+\tau}, a_{j+\tau}, r_{j+\tau}, s_{j+\tau}^{'})\right\}_{j=1}^{\text {batch size }} \subseteq D$
	
		\FOR{这个batch中的每个transition}
		\STATE $h_{j-1} \leftarrow 0$
		\FOR{$k = j \text { to } k = j + \tau $}
		\STATE 更新LSTM网络的隐藏状态 $h_{k} = Q(s_{k}, h_{k-1} | \theta_{i})$
		\ENDFOR
		\STATE 计算实际的$Q$值，即$y_{j}$\footnotemark[2]
		\STATE 计算损失 $L(\theta)=\left(y_{j}-Q\left(s_{j+\tau}, a_{j+\tau}, h_{j+\tau-1} ; \theta\right)\right)^{2}$
		\ENDFOR
		\STATE 关于参数$\theta$做随机梯度下降\footnotemark[3]
		\STATE 每$C$个回合复制参数$\hat{Q}\leftarrow Q$\footnotemark[4]
		\ENDFOR
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Deep Recurrent Q-Learning for Partially Observable MDPs}
\footnotetext[2]{$y_{j}= \begin{cases}r_{j} & \text {对于终止状态} s_{i+1} \\ r_{j}+\gamma \max _{a^{\prime}} Q\left(s_{j+\tau}, a_{j+\tau}, h_{j+\tau-1} ; \theta\right) & \text {对于非终止状态} s_{i+1}\end{cases}$}
\footnotetext[3]{$\theta_i \leftarrow \theta_i - \lambda \nabla_{\theta_{i}} L_{i}\left(\theta_{i}\right)$}
\clearpage


\section{PER-DQN算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{PER-DQN算法}\footnotemark[1]}  
    \renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
    \renewcommand{\algorithmicrequire}{\textbf{输入:}}  
    \renewcommand{\algorithmicensure}{\textbf{输出:}} 
	\begin{algorithmic}[1]
		% \REQUIRE $n \geq 0 \vee x \neq 0$ % 输入
		% \ENSURE $y = x^n$ % 输出
		\STATE 初始化当前网络参数$\theta$ % 初始化
		\STATE 复制参数到目标网络$\hat{Q} \leftarrow Q$
		\STATE 初始化经验回放$D$
		\FOR {回合数 = $1,M$}
			\STATE 重置环境，获得初始状态$s_t$
			\FOR {时步 = $1,t$}
				\STATE 根据$\varepsilon-greedy$策略采样动作$a_t$
				\STATE 环境根据$a_t$反馈奖励$r_t$和下一个状态$s_{t+1}$
				\STATE 存储transition即$(s_t,a_t,r_t,s_{t+1})$到经验回放$D$，并根据TD-error损失确定其优先级$p_t$
				\STATE 更新环境状态$s_t \leftarrow s_{t+1}$
				\STATE {\bfseries 更新策略：}
				\STATE 按照经验回放中的优先级别，每个样本采样概率为$P(j)=p_j^\alpha / \sum_i p_i^\alpha$，从$D$中采样一个大小为batch的transition
				\STATE 计算各个样本重要性采样权重 $w_j=(N \cdot P(j))^{-\beta} / \max _i w_i$
				\STATE 计算TD-error $\delta_j$ ; 并根据TD-error更新优先级$p_j$
				\STATE 计算实际的$Q$值，即$y_{j}$\footnotemark[2]
				\STATE 根据重要性采样权重调整损失 $L(\theta)=w_j \cdot \left(y_{j}-Q\left(s_{j}, a_{j} ; \theta\right)\right)^{2}$，并将其关于参数$\theta$做随机梯度下降\footnotemark[3]
			\ENDFOR
			\STATE 每$C$个回合复制参数$\hat{Q}\leftarrow Q$\footnotemark[4]
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Prioritized Experience Replay}
\footnotetext[2]{$y_{i}= \begin{cases}r_{i} & \text {对于终止状态} s_{i+1} \\ r_{i}+\gamma \max _{a^{\prime}} Q\left(s_{i+1}, a^{\prime} ; \theta\right) & \text {对于非终止状态} s_{i+1}\end{cases}$}
\footnotetext[3]{$\theta_i \leftarrow \theta_i - \lambda \nabla_{\theta_{i}} L_{i}\left(\theta_{i}\right)$}
\footnotetext[4]{此处也可像原论文中放到小循环中改成每$C$步，但没有每$C$个回合稳定}
\clearpage


\section{Policy Gradient算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{REINFORCE算法：Monte-Carlo Policy Gradient}\footnotemark[1]} 
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 初始化策略参数$\boldsymbol{\theta} \in \mathbb{R}^{d^{\prime}}($ e.g., to $\mathbf{0})$
		\FOR {回合数 = $1,M$}
			\STATE 根据策略$\pi(\cdot \mid \cdot, \boldsymbol{\theta})$采样一个(或几个)回合的transition
			\FOR {时步 $t = 0,1,2,\cdots,T-1$}
				\STATE 计算回报$G \leftarrow \sum_{k=t+1}^{T} \gamma^{k-t-1} R_{k}$
				\STATE 更新策略$\boldsymbol{\theta} \leftarrow {\boldsymbol{\theta}+\alpha \gamma^{t}} G \nabla \ln \pi\left(A_{t} \mid S_{t}, \boldsymbol{\theta}\right)$
			\ENDFOR
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Reinforcement Learning: An Introduction}
\clearpage
\section{Advantage Actor Critic算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{Q Actor Critic算法}} 
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 初始化Actor参数$\theta$和Critic参数$w$
		\FOR {回合数 = $1,M$}
			\STATE 根据策略$\pi_{\theta}(a|s)$采样一个(或几个)回合的transition
			\STATE  {\bfseries 更新Critic参数\footnotemark[1]}
			\FOR {时步 = $t+1,1$}
				\STATE 计算Advantage，即$ \delta_t = r_t + \gamma Q_w(s_{t+1},a_{t+1})-Q_w(s_t,a_t)$
				\STATE $w \leftarrow w+\alpha_{w} \delta_{t} \nabla_{w} Q_w(s_t,a_t)$
				\STATE $a_t \leftarrow a_{t+1}$,$s_t \leftarrow s_{t+1}$
			\ENDFOR
			\STATE 更新Actor参数$\theta \leftarrow \theta+\alpha_{\theta} Q_{w}(s, a) \nabla_{\theta} \log \pi_{\theta}(a \mid s)$
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{这里结合TD error的特性按照从$t+1$到$1$计算Advantage更方便}
\clearpage

\section{PPO-Clip算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{PPO-Clip算法}\footnotemark[1]\footnotemark[2]} 
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 初始化策略网络(Actor)参数$\theta$和价值网络(Critic)参数$\phi$
		\STATE 初始化Clip参数$\epsilon$
		\STATE 初始化epoch数$K$
		\STATE 初始化经验回放$D$
		\FOR {回合数 = $1,2,\cdots,M$}
			\STATE 根据策略$\pi_{\theta}$采样$C$个时步数据,收集轨迹$\tau = \{s_0,a_0,r_1,\cdots,s_t,a_t,r_{t+1},\cdots\}$到经验回放$D$中
			\FOR {epoch数 $ k = 1,2,\cdots,K$}
				\STATE 计算折扣奖励$\hat{R_t}$
				\STATE 计算优势函数，即$A^{\pi_{\theta_{k}}}=\hat{R_t}-V_{\phi_k}$
				\STATE 结合重要性采样计算Actor损失，如下：
				\STATE $L^{CLIP}(\theta)= \frac{1}{|D_k|T}\sum_{\tau \in D_k}\sum_{t=0}^{T} \min(\frac{\pi_\theta(a_t|s_t)}{\pi_{\theta_{k}}(a_t|s_t)} A^{\pi_{\theta_{k}}}(s_t,a_t), g(\epsilon,A^{\pi_{\theta_{k}}}(s_t,a_t)))$\footnotemark[3]
				\STATE 梯度上升更新Actor参数：$\theta_{k+1} \leftarrow \theta_{k} + \alpha_{\theta} \nabla_{\theta} L^{CLIP}(\theta)$
				\STATE 更新Critic参数:
				\STATE $\phi_{k+1} \leftarrow \phi_{k}- \alpha_{\phi}\nabla_{\phi_{k}}\frac{1}{|D_k|T}\sum_{\tau \in D_k}\sum_{t=0}^{T} (V_{\phi_{k}}(s_t)-\hat{R_t})^2$
			\ENDFOR
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Proximal Policy Optimization Algorithms}
\footnotetext[2]{\url{https://spinningup.openai.com/en/latest/algorithms/ppo.html}}
\footnotetext[3]{$L^{CLIP}(\theta)=\hat{E_t}[\min(r_t(\theta)\hat{A_t}, \operatorname{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon)\hat{A_t})]$}
\clearpage

\section{PPO-KL散度算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{PPO-KL散度算法}\footnotemark[1]\footnotemark[2]} 
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 初始化策略网络(Actor)参数$\theta$和价值网络(Critic)参数$\phi$
		\STATE 初始化KL散度参数$\lambda$
		\STATE 初始化回合数量$M$
		\STATE 初始化epoch数量$K$
		\STATE 初始化经验回放$D$
		\FOR {回合数 = $1,2,\cdots,M$}
			\STATE 根据策略$\pi_{\theta_m}$采样一个或几个回合数据,收集($s_t,a_t,r_t$)到经验回放$D_m=\{\tau_i\}$中
			\FOR {epoch数 = $1,2,\cdots,K$}
				\STATE 计算折扣奖励$\hat{R_t}$
				\STATE 根据值函数$V_{\Phi_m}$,用某种优势估计方法计算优势函数$\hat{A_t}$
				\STATE 通过最大化目标函数$J_{PPO}(\theta)$更新参数$\theta$：
				\STATE $J_{PPO}(\theta)=\sum_{t=1}^{T}\frac{\pi_\theta(a_t|s_t)}{\pi_{old}(a_t|s_t)}\hat{A_t}-\lambda KL[\pi_{old}|\pi_\theta]$
				\STATE 典型方法是Adam随机梯度上升
				\STATE 根据均方误差回归拟合值函数，更新Critic参数:
				\STATE $\Phi_{m+1} \leftarrow \arg\min_{\Phi} \frac{1}{|D_m|T}\sum_{\tau \in D_m}\sum_{t=0}^{T} (V_{\Phi}(s_t)-\hat{R_t})^2$
				\STATE 运用某些梯度下降算法
				\IF{$KL[\pi_{old}|\pi_\theta]>\beta_{high}KL_{target}$}
					\STATE $\lambda \leftarrow \alpha\lambda$
				\ELSIF{$KL[\pi_{old}|\pi_\theta]<\beta_{low}KL_{target}$}
					\STATE $\lambda \leftarrow \frac{\lambda}{\alpha}$
				\ENDIF
			\ENDFOR
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Proximal Policy Optimization Algorithms}
\footnotetext[2]{Emergence of Locomotion Behaviours in Rich Environments}

\clearpage
\section{DDPG算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{DDPG算法}\footnotemark[1]} 
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 初始化critic网络$Q\left(s, a \mid \theta^Q\right)$和actor网络$\mu(s|\theta^{\mu})$的参数$\theta^Q$和$\theta^{\mu}$
		\STATE 初始化对应的目标网络参数，即$\theta^{Q^{\prime}} \leftarrow \theta^Q, \theta^{\mu^{\prime}} \leftarrow \theta^\mu$
		\STATE 初始化经验回放$R$
		\FOR {回合数 = $1,M$}
			\STATE 选择动作$a_t=\mu\left(s_t \mid \theta^\mu\right)+\mathcal{N}_t$，$\mathcal{N}_t$为探索噪声
			\STATE 环境根据$a_t$反馈奖励$r_t$和下一个状态$s_{t+1}$
			\STATE 存储transition$(s_t,a_t,r_t,s_{t+1})$到经验回放$R$中
			\STATE 更新环境状态$s_t \leftarrow s_{t+1}$
			\STATE {\bfseries 更新策略：}
			\STATE 从$R$中取出一个随机批量的$(s_i,a_i,r_i,s_{i+1})$
			\STATE 求得$y_i=r_i+\gamma Q^{\prime}\left(s_{i+1}, \mu^{\prime}\left(s_{i+1} \mid \theta^{\mu^{\prime}}\right) \mid \theta^{Q^{\prime}}\right)$
			\STATE 更新critic参数，其损失为：$L=\frac{1}{N} \sum_i\left(y_i-Q\left(s_i, a_i \mid \theta^Q\right)\right)^2$
			\STATE 更新actor参数：$\left.\left.\nabla_{\theta^\mu} J \approx \frac{1}{N} \sum_i \nabla_a Q\left(s, a \mid \theta^Q\right)\right|_{s=s_i, a=\mu\left(s_i\right)} \nabla_{\theta^\mu} \mu\left(s \mid \theta^\mu\right)\right|_{s_i}$
			\STATE 软更新目标网络：$\theta^{Q^{\prime}} \leftarrow \tau \theta^Q+(1-\tau) \theta^{Q^{\prime}}$，
			$\theta^{\mu^{\prime}} \leftarrow \tau \theta^\mu+(1-\tau) \theta^{\mu^{\prime}}$
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Continuous control with deep reinforcement learning}
\clearpage
\section{SoftQ算法}
\begin{algorithm}[H]
    \floatname{algorithm}{{SoftQ算法}}  
    \renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{}  
	\begin{algorithmic}[1]
		\STATE 初始化参数$\theta$和$\phi$% 初始化
		\STATE 复制参数$\bar{\theta} \leftarrow \theta, \bar{\phi} \leftarrow \phi$
		\STATE 初始化经验回放$D$
		\FOR {回合数 = $1,M$}
			\FOR {时步 = $1,t$}
				\STATE 根据$\mathbf{a}_{t} \leftarrow f^{\phi}\left(\xi ; \mathbf{s}_{t}\right)$采样动作，其中$\xi \sim \mathcal{N}(\mathbf{0}, \boldsymbol{I})$
				\STATE 环境根据$a_t$反馈奖励$r_t$和下一个状态$s_{t+1}$
				\STATE 存储transition即$(s_t,a_t,r_t,s_{t+1})$到经验回放$D$中
				\STATE 更新环境状态$s_t \leftarrow s_{t+1}$
				\STATE {\bfseries 更新soft Q函数参数：}
				\STATE 对于每个$s^{(i)}_{t+1}$采样$\left\{\mathbf{a}^{(i, j)}\right\}_{j=0}^{M} \sim q_{\mathbf{a}^{\prime}}$
				\STATE 计算empirical soft values $V_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}\right)$\footnotemark[1]
				\STATE 计算empirical gradient $J_{Q}(\theta)$\footnotemark[2]
				\STATE 根据$J_{Q}(\theta)$使用ADAM更新参数$\theta$
				\STATE {\bfseries 更新策略：}
				\STATE  对于每个$s^{(i)}_{t}$采样$\left\{\xi^{(i, j)}\right\}_{j=0}^{M} \sim \mathcal{N}(\mathbf{0}, \boldsymbol{I})$
				\STATE 计算$\mathbf{a}_{t}^{(i, j)}=f^{\phi}\left(\xi^{(i, j)}, \mathbf{s}_{t}^{(i)}\right)$
				\STATE 使用经验估计计算$\Delta f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)$\footnotemark[3]
				\STATE 计算经验估计$\frac{\partial J_{\pi}\left(\phi ; \mathbf{s}_{t}\right)}{\partial \phi} \propto \mathbb{E}_{\xi}\left[\Delta f^{\phi}\left(\xi ; \mathbf{s}_{t}\right) \frac{\partial f^{\phi}\left(\xi ; \mathbf{s}_{t}\right)}{\partial \phi}\right]$，即$\hat{\nabla}_{\phi} J_{\pi}$
				\STATE 根据$\hat{\nabla}_{\phi} J_{\pi}$使用ADAM更新参数$\phi$
				\STATE 
			\ENDFOR
			\STATE 每$C$个回合复制参数$\bar{\theta} \leftarrow \theta, \bar{\phi} \leftarrow \phi$
		\ENDFOR	
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{$V_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}\right)=\alpha \log \mathbb{E}_{q_{\mathbf{a}^{\prime}}}\left[\frac{\exp \left(\frac{1}{\alpha} Q_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}, \mathbf{a}^{\prime}\right)\right)}{q_{\mathbf{a}^{\prime}}\left(\mathbf{a}^{\prime}\right)}\right]$}
\footnotetext[2]{$J_{Q}(\theta)=\mathbb{E}_{\mathbf{s}_{t} \sim q_{\mathbf{s}_{t}}, \mathbf{a}_{t} \sim q_{\mathbf{a}_{t}}}\left[\frac{1}{2}\left(\hat{Q}_{\mathrm{soft}}^{\bar{\theta}}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-Q_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)\right)^{2}\right]$}
\footnotetext[3]{$\begin{aligned} \Delta f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)=& \mathbb{E}_{\mathbf{a}_{t} \sim \pi^{\phi}}\left[\left.\kappa\left(\mathbf{a}_{t}, f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)\right) \nabla_{\mathbf{a}^{\prime}} Q_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}, \mathbf{a}^{\prime}\right)\right|_{\mathbf{a}^{\prime}=\mathbf{a}_{t}}\right.\\ &\left.+\left.\alpha \nabla_{\mathbf{a}^{\prime}} \kappa\left(\mathbf{a}^{\prime}, f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)\right)\right|_{\mathbf{a}^{\prime}=\mathbf{a}_{t}}\right] \end{aligned}$}
\clearpage
\section{SAC-S算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{SAC-S算法}\footnotemark[1]} 
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 初始化参数$\psi, \bar{\psi}, \theta, \phi$
		\FOR {回合数 = $1,M$}
			\FOR {时步 = $1,t$}
				\STATE 根据$\boldsymbol{a}_{t} \sim \pi_{\phi}\left(\boldsymbol{a}_{t} \mid \mathbf{s}_{t}\right)$采样动作$a_t$
				\STATE 环境反馈奖励和下一个状态，$\mathbf{s}_{t+1} \sim p\left(\mathbf{s}_{t+1} \mid \mathbf{s}_{t}, \mathbf{a}_{t}\right)$
				\STATE 存储transition到经验回放中，$\mathcal{D} \leftarrow \mathcal{D} \cup\left\{\left(\mathbf{s}_{t}, \mathbf{a}_{t}, r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right), \mathbf{s}_{t+1}\right)\right\}$
				\STATE 更新环境状态$s_t \leftarrow s_{t+1}$
				\STATE {\bfseries 更新策略：}
				\STATE $\psi \leftarrow \psi-\lambda_{V} \hat{\nabla}_{\psi} J_{V}(\psi)$
				\STATE $\theta_{i} \leftarrow \theta_{i}-\lambda_{Q} \hat{\nabla}_{\theta_{i}} J_{Q}\left(\theta_{i}\right)$ for $i \in\{1,2\}$
				\STATE $\phi \leftarrow \phi-\lambda_{\pi} \hat{\nabla}_{\phi} J_{\pi}(\phi)$
				\STATE $\bar{\psi} \leftarrow \tau \psi+(1-\tau) \bar{\psi}$
			\ENDFOR
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor}
\clearpage
\section{SAC算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{SAC算法}\footnotemark[1]}  
    \renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1]
		\STATE 初始化网络参数$\theta_1,\theta_2$以及$\phi$ % 初始化
		\STATE 复制参数到目标网络$\bar{\theta_1} \leftarrow \theta_1,\bar{\theta_2} \leftarrow \theta_2$
		\STATE 初始化经验回放$D$
		\FOR {回合数 = $1,M$}
			\STATE 重置环境，获得初始状态$s_t$
			\FOR {时步 = $1,t$}
				\STATE 根据$\boldsymbol{a}_{t} \sim \pi_{\phi}\left(\boldsymbol{a}_{t} \mid \mathbf{s}_{t}\right)$采样动作$a_t$
				\STATE 环境反馈奖励和下一个状态，$\mathbf{s}_{t+1} \sim p\left(\mathbf{s}_{t+1} \mid \mathbf{s}_{t}, \mathbf{a}_{t}\right)$
				\STATE 存储transition到经验回放中，$\mathcal{D} \leftarrow \mathcal{D} \cup\left\{\left(\mathbf{s}_{t}, \mathbf{a}_{t}, r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right), \mathbf{s}_{t+1}\right)\right\}$
				\STATE 更新环境状态$s_t \leftarrow s_{t+1}$
				\STATE {\bfseries 更新策略：}
				\STATE 更新$Q$函数，$\theta_{i} \leftarrow \theta_{i}-\lambda_{Q} \hat{\nabla}_{\theta_{i}} J_{Q}\left(\theta_{i}\right)$ for $i \in\{1,2\}$\footnotemark[2]\footnotemark[3]
				\STATE 更新策略权重，$\phi \leftarrow \phi-\lambda_{\pi} \hat{\nabla}_{\phi} J_{\pi}(\phi)$ \footnotemark[4]
				\STATE 调整temperature，$\alpha \leftarrow \alpha-\lambda \hat{\nabla}_{\alpha} J(\alpha)$ \footnotemark[5]
				\STATE 更新目标网络权重，$\bar{\theta}_{i} \leftarrow \tau \theta_{i}+(1-\tau) \bar{\theta}_{i}$ for $i \in\{1,2\}$
			\ENDFOR
		\ENDFOR
	\end{algorithmic}	
\end{algorithm}
\footnotetext[1]{Soft Actor-Critic Algorithms and Applications}
\footnotetext[2]{$J_{Q}(\theta)=\mathbb{E}_{\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right) \sim \mathcal{D}}\left[\frac{1}{2}\left(Q_{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-\left(r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)+\gamma \mathbb{E}_{\mathbf{s}_{t+1} \sim p}\left[V_{\bar{\theta}}\left(\mathbf{s}_{t+1}\right)\right]\right)\right)^{2}\right]$}
% \footnotetext[3]{$\hat{\nabla}_{\theta} J_{Q}(\theta)=\nabla_{\theta} Q_{\theta}\left(\mathbf{a}_{t}, \mathbf{s}_{t}\right)\left(Q_{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-\left(r\left(\mathbf{s}_{t}), \mathbf{a}_{t}\right)+\gamma\left(Q_{\bar{\theta}}\left(\mathbf{s}_{t+1}, \mathbf{a}_{t+1}\right)-\alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t+1} \mid \mathbf{s}_{t+1}\right)\right)\right)\right)\right.$}
% \footnotetext[4]{$\hat{\nabla}_{\phi} J_{\pi}(\phi)=\nabla_{\phi} \alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)\right)+\left(\nabla_{\mathbf{a}_{t}} \alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)\right)-\nabla_{\mathbf{a}_{t}} Q\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)\right) \nabla_{\phi} f_{\phi}\left(\epsilon_{t} ; \mathbf{s}_{t}\right)$,$\mathbf{a}_{t}=f_{\phi}\left(\epsilon_{t} ; \mathbf{s}_{t}\right)$}
% \footnotetext[5]{$J(\alpha)=\mathbb{E}_{\mathbf{a}_{t} \sim \pi_{t}}\left[-\alpha \log \pi_{t}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)-\alpha \overline{\mathcal{H}}\right]$}
\clearpage


\section{GAIL算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{GAIL算法}\footnotemark[1]} 
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 采样专家轨迹$\tau _{E} \sim \pi _{E}$，初始化网络模型参数$\theta _{0}$和判别器$D$参数$\omega _{0}$
		\FOR {回合数 $i = 1,2,\cdots $}
			\STATE 采样策略轨迹$\tau_{i} \sim \pi _{\theta _{i}}$
			\STATE 使用梯度下降更新判别器$D$的参数$\omega _{i}$，梯度为：
			\begin{center}
				\begin{equation}
				\hat{\mathbb{E}}_{\tau_{i}}\left[\nabla_{w} \log \left(D_{w}(s, a)\right)\right]+\hat{\mathbb{E}}_{\tau_{E}}\left[\nabla_{w} \log \left(1-D_{w}(s, a)\right)\right]
				\end{equation}
			\end{center}
			\STATE 使用判别器$D$对策略轨迹$\tau_{i}$的输出作为奖励更新策略$\pi _{\theta _{i}}$\footnotemark[2]
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Generative Adversarial Imitation Learning}
\footnotetext[2]{策略更新方式与策略模型$\pi _{\theta}$有关，如PPO-Clip等.}
\clearpage

\section{MAPPO算法}
\begin{algorithm}[H] % [H]固定位置
    \floatname{algorithm}{{MAPPO算法}\footnotemark[1]\footnotemark[2]} 
	\renewcommand{\thealgorithm}{} % 去掉算法标号
	\caption{} 
	\begin{algorithmic}[1] % [1]显示步数
		\STATE 初始化每个智能体$u$的Critic网络$Q_{\phi^u}$和参数为$\theta^u$的Actor网络，$u \in U$
		\STATE 初始化每个智能体$u$的目标Actor网络$\pi_{old}^u$的参数$\theta_{old}^u\leftarrow\theta^u$和目标Critic网络$Q_{\overline{\phi}^u}$的参数$\overline{\phi}^u\leftarrow\phi^u$
		\STATE 初始化epoch数$K$
		\STATE 初始化经验回放$D$
		\FOR {回合数 = $1,2,\cdots,M$}
			\STATE 初始化状态$s_1$
			\STATE 每个智能体$u$都根据各自策略采样$C$个时步数据,收集轨迹$\tau^u = {\{o_t^u, a_t^u, r_{t+1}}\}_{t=1}^T$\footnotemark[3]
			\STATE 对每个时步的每条轨迹
			\STATE 计算折扣奖励$\{\hat{R}_t^u\}_{t=1}^T$
			\STATE 计算优势函数$\{A_t^u=\hat{R}_t^u-V_{\phi_t}^u\}_{t=1}^T$
			\STATE 计算$y_t^u=V_{\phi_t}^u+A_t^u$
			\STATE 将每个时步的数据$\{[o_t^u, a_t^u, y_t^u, A_t^u]_{u=1}^U\}_{t=1}^T$ 都存储到经验回放$D$中
			\FOR {epoch数 $ k = 1,2,\cdots,K$}
				\STATE 打乱$D$中数据顺序并重新编号
				\FOR {$j=0,1,\cdots,\frac{T}{B}-1$}
					\STATE 选择$B$条数据$\{o_i^u, a_i^u, y_i^u, A_i^u\}_{i=1+Bj}^{B(j+1)}$
					\STATE 计算梯度：
					\STATE $\bigtriangleup\theta^u=\frac{1}{B}\sum_{i=1}^B\{\bigtriangledown_{\theta^u}f(r_i(\theta^u),A_i^u)\}$\footnotemark[4]
					
					\STATE $\bigtriangleup\phi^u=\frac{1}{B}\sum_{i=1}^B\{\bigtriangledown_{\phi^u}(y_i^u-V_{\phi^u}(o_i^u))^2\}$
					\STATE Adam梯度上升方法计算$\theta^u$，Adam梯度下降方法计算$\phi^u$
				\ENDFOR
			\ENDFOR
		\STATE 更新$\theta_{old}^u\leftarrow\theta^u$, $\overline{\phi}^u\leftarrow\phi^u$
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Joint Optimization of Handover Control and Power Allocation Based on Multi-Agent Deep Reinforcement Learning}
\footnotetext[2]{The Surprising Effectiveness of PPO in Cooperative Multi-Agent Games}
\footnotetext[3]{$o_t^u$是智能体$u$在时刻$t$的观测值，是局部观测值，而$s_t$是全局状态}
\footnotetext[4]{函数f是目标函数，不同的方法如Clip或者KL散度等目标函数不同}
\clearpage

\end{document}